home *** CD-ROM | disk | FTP | other *** search
Text File | 2000-10-06 | 13.1 KB | 646 lines | [TEXT/CWIE] |
- ///--------------------------------------------------------------------------------------
- // BlitPixieRect - a fast software blitter
- //
- // Based on (but completely rewritten) the old BlitPixie routines by :
- // Tony Myles, Brian Roddy, Christofer Åkersten, Tom Bishop,
- // Ben Sharpe, Brigham Stevens, Sean Callahan, Joe Britt and Tim Collins
- //
- // written by Anders F Björklund <afb@algonet.se>
- // © 1999 afb.
- ///--------------------------------------------------------------------------------------
-
- #ifndef __BLITPIXIE__ // only pull in the main header when __BLITPIXIE__ is not defined yet
- #include "BlitPixieHeader.h"
- #endif
-
- #include "BlitPixieAsm.h"
-
- #define USE_GENERIC_CACHE 0 // set to 1 to compile the PPC cache instructions (dcbz/dcbt) into the blit loops
-
- #if USE_GENERIC_CACHE
- Boolean gBPCacheable = false; // is the destination memory cacheable ? (that is: not in VRAM)
- #endif
-
- #pragma mark *** PowerPC asm:
- #if USE_PPC_ASSEMBLY
-
- ///--------------------------------------------------------------------------------------
- // BlitPixieRect (PowerPC assembly version)
- //
- // Copies a rectangle that is numBytesPerRow bytes wide and rowsToCopy rows high from
- // srcPixelPTemp to dstPixelPTemp. srcOffset / dstOffset are used as the source and
- // destination rowBytes: once a row has been copied, both pointers are advanced by
- // (rowBytes - width) to reach the start of the next row.
- ///--------------------------------------------------------------------------------------
-
- // NOTE: assumes bytes, rows > 0 (otherwise overflow)
- // (the row counters are decremented before they are tested, so 0 would wrap around)
-
- // NOTE: assumes srcRowBytes & dstRowBytes are multiples of 4 (for alignment purposes)
-
- // the PPC blitter is actually 3 in 1:
- // • one using doubles, for wordaligned blits
- // (this one is always used on PPC 601, which can blit unaligned doubles too)
- // • one using words, for unaligned blits
- // (un-wordaligned doubles causes alignment exceptions on everything but 601)
- // • one for narrow strips, up to 32 bytes wide
- //
- // "aligned" above means that src and dst have the same address modulo 4.
-
- ASM_FUNC void BlitPixieRect(
- register unsigned char *srcPixelPTemp, // r3
- register unsigned char *dstPixelPTemp, // r4
- register unsigned long srcOffset, // r5
- register unsigned long dstOffset, // r6
- register unsigned short numBytesPerRow, // r7
- register unsigned short rowsToCopy) // r8
- {
- #define r_srcRowBytes r5 // srcOffset, used as the source rowBytes
- #define r_dstRowBytes r6 // dstOffset, used as the destination rowBytes
- #define r_width r7
- #define r_height r8
-
- #define r_dstStride r31 // added to dst after each row (rowBytes - width; plain rowBytes in @small)
- #define r_srcStride r30 // added to src after each row (rowBytes - width; plain rowBytes in @small)
- #define r_bytes r29 // row bytes not yet assigned to pre-block / 32-byte blocks
- #define r_blocks r28 // number of whole 32-byte blocks per row
- #define r_y r27 // rows left to copy
-
- ASM_BEGIN
-
- cmplwi r_width,32
- ble @small // rows of at most 32 bytes take the narrow-strip path
-
- // save r27-r31 at negative offsets from SP (restored at @end)
- stw r27,-20(SP)
- stw r28,-16(SP)
- stw r29,-12(SP)
- stw r30,-8(SP)
- stw r31,-4(SP)
-
-
- lwz r9,gBlitPixieProcessorType(RTOC) // address of the global; dereferenced by the lha below
- #if USE_GENERIC_CACHE
- // was gCacheable, but the flag this file defines under USE_GENERIC_CACHE is gBPCacheable
- lwz r10,gBPCacheable(RTOC)
- #endif
-
- mr r_bytes,r_width
- mr r_y,r_height
-
- lha r9,0(r9) // r9 = processor type
- #if USE_GENERIC_CACHE
- lbz r10,0(r10) // r10 = cacheable flag
- #endif
-
- sub r_srcStride,r_srcRowBytes,r_width
- sub r_dstStride,r_dstRowBytes,r_width
-
- cmplwi cr5,r9,k601 // PPC 601?
- #if USE_GENERIC_CACHE
- cmplwi cr1,r10,0 // cacheable?
- #endif
-
- rlwinm r9,r3,0,30,31 // src & 3
- rlwinm r10,r4,0,30,31 // dst & 3
- cmplw cr6,r9,r10 // aligned?
-
- // Dispatch as described in the header comment. The original tested "bne cr5" here,
- // which sent every non-601 processor to the doubles path even for unaligned blits
- // and let only the 601 reach the word path - the opposite of the stated intent.
- beq cr5,@aligned // 601: always use doubles
- bne cr6,@unaligned // others: use words when src and dst differ modulo 4
-
- @aligned:
-
- // ALIGNED/DOUBLES ------------------------------------------------------------------
-
- // r0 = (-dst) & 31 = bytes to copy before dst reaches a 32-byte boundary
- neg r0,r4
- rlwinm r0,r0,0,27,31
- sub r_bytes,r_bytes,r0
-
- // bias both pointers by -8: every access below uses displacement 8 or more,
- // which lets the block loop advance with the update forms lfdu/stfdu 32(rN)
- subi r3,r3,8
- subi r4,r4,8
-
- rlwinm. r_blocks,r_bytes,27,5,31 // r_blocks = r_bytes >> 5 (also sets cr0)
- rlwinm r_bytes,r_bytes,0,27,31 // r_bytes = r_bytes & 31 (post-block size)
-
- // condition register bits (within cr5..cr7) used as per-row copy flags
- #define FLAG_BLOCKS 20
- // #define FLAG_FREE 21 // note: free for use
-
- #define FLAG_PRE1 26
- #define FLAG_PRE2 25
- #define FLAG_PRE4 24
- #define FLAG_PRE8 23
- #define FLAG_PRE16 22
-
- #define FLAG_POST1 31
- #define FLAG_POST2 30
- #define FLAG_POST4 29
- #define FLAG_POST8 28
- #define FLAG_POST16 27
-
- // move the 5 bits of the pre-block size into bits 22-26 and the 5 bits of the
- // post-block size into bits 27-31, then copy them into the condition register
- rlwinm r0,r0,27-FLAG_PRE16,FLAG_PRE16,FLAG_PRE1
- rlwimi r0,r_bytes,27-FLAG_POST16,FLAG_POST16,FLAG_POST1
- mtcrf 0x07,r0 // cr5 | cr6 | cr7
- // FLAG_BLOCKS = not EQ from the "rlwinm." above, i.e. r_blocks != 0
- // (assumes 0*CR_NO + CR_EQ names the EQ bit of cr0 - see BlitPixieAsm.h)
- crnor FLAG_BLOCKS,0*CR_NO + CR_EQ,0*CR_NO + CR_EQ
-
- // NOTE(review): the flags are computed once from the first row's dst address, so
- // later rows are only 32-byte aligned in the block loop if dstRowBytes is a
- // multiple of 32; the byte counts copied are the same either way.
-
- @alignrowloop:
-
- mtctr r_blocks
-
- // copy pre-block
- // (1, 2, 4, 8 then 16 bytes, so dst alignment grows with each step;
- // IF_NOT is presumably the "branch if bit clear" encoding from BlitPixieAsm.h)
- bc IF_NOT,FLAG_PRE1,@skip_pre1
-
- lbz r0,8(r3)
- addi r3,r3,1
- stb r0,8(r4)
- addi r4,r4,1
-
- @skip_pre1:
- bc IF_NOT,FLAG_PRE2,@skip_pre2
-
- lhz r0,8(r3)
- addi r3,r3,2
- sth r0,8(r4)
- addi r4,r4,2
-
- @skip_pre2:
- bc IF_NOT,FLAG_PRE4,@skip_pre4
-
- lwz r0,8(r3)
- addi r3,r3,4
- stw r0,8(r4)
- addi r4,r4,4
-
- @skip_pre4:
- bc IF_NOT,FLAG_PRE8,@skip_pre8
-
- lfd fp0,8(r3)
- addi r3,r3,8
- stfd fp0,8(r4)
- addi r4,r4,8
-
- @skip_pre8:
- bc IF_NOT,FLAG_PRE16,@skip_pre16
-
- lfd fp1,8(r3)
- lfd fp2,16(r3)
- addi r3,r3,16
- stfd fp1,8(r4)
- stfd fp2,16(r4)
- addi r4,r4,16
-
- @skip_pre16:
-
- // copy blocks
- bc IF_NOT,FLAG_BLOCKS,@skipalignblockloop
- li r0,8 // index for dcbz/dcbt below: r4+8 / r3+8 undo the -8 pointer bias
-
- @alignblockloop:
- // 32 bytes per iteration through four doubles; lfdu/stfdu advance the pointers by 32
- lfd fp1,8(r3)
- lfd fp2,16(r3)
- lfd fp3,24(r3)
- lfdu fp4,32(r3)
-
- #if USE_GENERIC_CACHE
- beq cr1,@no_cache1 // skip the cache hints when the cacheable flag is 0
- dcbz r4,r0
- dcbt r3,r0
- @no_cache1:
- #endif
-
- stfd fp1,8(r4)
- stfd fp2,16(r4)
- stfd fp3,24(r4)
- stfdu fp4,32(r4)
-
- bdnz @alignblockloop
- @skipalignblockloop:
-
- subic. r_y,r_y,1 // one row less; cr0 is tested by the bne at the end of the row
-
- // copy post-block
- // (16, 8, 4, 2 then 1 bytes; these branches test cr5-cr7 bits and leave cr0 alone)
- bc IF_NOT,FLAG_POST16,@skip_post16
-
- lfd fp1,8(r3)
- lfd fp2,16(r3)
- addi r3,r3,16
- stfd fp1,8(r4)
- stfd fp2,16(r4)
- addi r4,r4,16
-
- @skip_post16:
- bc IF_NOT,FLAG_POST8,@skip_post8
-
- lfd fp0,8(r3)
- addi r3,r3,8
- stfd fp0,8(r4)
- addi r4,r4,8
-
- @skip_post8:
- bc IF_NOT,FLAG_POST4,@skip_post4
-
- lwz r0,8(r3)
- addi r3,r3,4
- stw r0,8(r4)
- addi r4,r4,4
-
- @skip_post4:
- bc IF_NOT,FLAG_POST2,@skip_post2
-
- lhz r0,8(r3)
- addi r3,r3,2
- sth r0,8(r4)
- addi r4,r4,2
-
- @skip_post2:
- bc IF_NOT,FLAG_POST1,@skip_post1
-
- lbz r0,8(r3)
- addi r3,r3,1
- stb r0,8(r4)
- addi r4,r4,1
-
- @skip_post1:
-
- // advance to the start of the next row
- add r3,r3,r_srcStride
- add r4,r4,r_dstStride
-
- bne @alignrowloop // rows left? (cr0 from the subic. above)
-
- b @end
-
- @unaligned:
-
- // UNALIGNED ------------------------------------------------------------------------
-
- rlwinm r_blocks,r_bytes,27,5,31 // r_blocks = width >> 5
- rlwinm r_bytes,r_bytes,0,27,31 // r_bytes = width & 31 (leftover bytes per row)
- li r0,32 // index: rN+32 undoes the -32 pointer bias below
-
- cmplwi cr5,r_blocks,0
- cmplwi cr6,r_bytes,0
- mtxer r_bytes // byte count used by lswx/stswx for the leftover
-
- // bias both pointers by -32 so the block loop can use lwzu/stwu 32(rN)
- subi r3,r3,32
- subi r4,r4,32
-
- @unalignrowloop:
-
- beq cr5,@skipunalignblockloop
- mtctr r_blocks
-
- @unalignblockloop:
- // 32 bytes per iteration through r5-r12; this overwrites the r5-r8 arguments,
- // which is fine because the strides and counters were copied to r27-r31 already
- lwzu r5,32(r3)
- lwz r6,4(r3)
- lwz r7,8(r3)
- lwz r8,12(r3)
- lwz r9,16(r3)
- lwz r10,20(r3)
- lwz r11,24(r3)
- lwz r12,28(r3)
-
- #if USE_GENERIC_CACHE
- // NOTE(review): dst is not necessarily 32-byte aligned on this path, and dcbz
- // clears the whole cache block containing r4+32 - verify before enabling the flag
- beq cr1,@no_cache2
- dcbz r4,r0
- dcbt r3,r0
- @no_cache2:
- #endif
-
- stwu r5,32(r4)
- stw r6,4(r4)
- stw r7,8(r4)
- stw r8,12(r4)
- stw r9,16(r4)
- stw r10,20(r4)
- stw r11,24(r4)
- stw r12,28(r4)
- bdnz @unalignblockloop
- @skipunalignblockloop:
-
- subic. r_y,r_y,1 // one row less; cr0 is tested by the bne at the end of the row
-
- beq cr6,@skipunalignleftover
- lswx r5,r3,r0 // load the leftover bytes (count in XER) from r3+32
- add r3,r3,r_bytes
- stswx r5,r4,r0 // store them at r4+32
- add r4,r4,r_bytes
- @skipunalignleftover:
-
- // advance to the start of the next row
- add r3,r3,r_srcStride
- add r4,r4,r_dstStride
-
- bne @unalignrowloop // rows left? (cr0 from the subic. above)
-
- @end:
-
- // END ------------------------------------------------------------------------------
-
- // restore the registers saved on entry
- lwz r27,-20(SP)
- lwz r28,-16(SP)
- lwz r29,-12(SP)
- lwz r30,-8(SP)
- lwz r31,-4(SP)
- blr
-
- @small:
-
- // SMALL ----------------------------------------------------------------------------
-
- mtxer r_width // byte count for lswx/stswx = row width (at most 32)
- mtctr r_height // one loop iteration per row
-
- // only r30/r31 are needed on this path: save them below SP
- stw r_srcStride,-8(SP)
- stw r_dstStride,-4(SP)
-
- // the pointers are not advanced by the copy itself, so the per-row step is the
- // full rowBytes; it is copied out of r5/r6 because lswx overwrites r5 and up
- mr r_srcStride,r_srcRowBytes
- mr r_dstStride,r_dstRowBytes
-
- @smallrowloop:
-
- lswx r5,r0,r3 // load one whole row from (r3) into r5 and the following registers
- add r3,r3,r_srcStride
- stswx r5,r0,r4 // store it to (r4)
- add r4,r4,r_dstStride
-
- bdnz @smallrowloop
-
- lwz r_srcStride,-8(SP)
- lwz r_dstStride,-4(SP)
-
- // no explicit blr here: presumably ASM_END / the end of the asm function returns
- ASM_END
- }
-
- #pragma mark *** 680X0 asm:
- #elif USE_68K_ASSEMBLY
-
- ///--------------------------------------------------------------------------------------
- // BlitPixieRect (680X0 assembly version)
- ///--------------------------------------------------------------------------------------
- // Copies numBytesPerRow bytes on each of rowsToCopy rows from srcPixelP to dstPixelP; srcOffset / dstOffset are used as the source / destination rowBytes.
- // NOTE: assumes bytes, rows > 0 (otherwise overflow) - the row counter is decremented before it is tested
-
- #define SMALL 32 // rows narrower than this many bytes take the simplified "small" blitter (tested with BCS below)
-
- // the 68k blitter is actually 3 in 1 :
- // • if on a '040 and properly aligned, it uses MOVE16 instructions
- // otherwise it uses normal MOVE.L instructions:
- // • a main blitter, which uses "Duff's Device" to unroll the loops
- // • a simplified blitter, with less setup overhead for small blits
-
- ASM_FUNC void BlitPixieRect(
- unsigned char *srcPixelP, // loaded into A0
- unsigned char *dstPixelP, // loaded into A1
- unsigned long srcOffset, // loaded into D2 (D_srcBytes)
- unsigned long dstOffset, // loaded into D3 (D_dstBytes)
- unsigned short numBytesPerRow, // loaded into D4 (D_bytes)
- unsigned short rowsToCopy) // loaded into D5 (D_y)
- {
- #define D_srcBytes D2 // source rowBytes, then source stride (rowBytes - width)
- #define D_dstBytes D3 // destination rowBytes, then destination stride (rowBytes - width)
- #define D_bytes D4 // row width in bytes, later reduced to a left-over count
- #define D_y D5 // rows left to copy
- #define D_align D6 // MOVE16 path: long words needed to align dst; other paths: bit 1 of the width
-
- ASM_BEGIN
-
- MOVEM.L D3-D6/A2,-(SP) // save the registers restored at @end
-
- MOVEM.L srcPixelP,A0-A1 // A0 = src, A1 = dst (reads two consecutive longs, so relies on the parameters being adjacent in memory)
- MOVEM.L srcOffset,D2-D3 // D2 = srcOffset, D3 = dstOffset (same reliance)
- MOVEM.W numBytesPerRow,D4-D5 // D4 = width, D5 = rows (same reliance)
-
- EXT.L D_bytes // sign-extend the width to a long for the SUB.L instructions below
- CMPI.W ASM_NUM(SMALL),D_bytes
- BCS @small // big enough ? (width below SMALL goes to the small blitter)
-
- // check for proper processor
- CMP.W #k68040,gBlitPixieProcessorType
- BNE @not68040
-
- // check for proper alignment
- MOVE.W A0,D0
- MOVE.W A1,D1
- ANDI.W #0x0F,D0 // src & 15
- ANDI.W #0x0F,D1 // dst & 15
- CMP.W D0,D1
- BNE @not68040 // src and dst at the same offset within 16 bytes ?
- OR.W D1,D0
- ANDI.W #0x03,D0
- BNE @not68040 // both aligned to a long word (4 bytes) ?
- MOVE.W D_srcBytes,D0
- OR.W D_dstBytes,D0
- ANDI.W #0x0F,D0
- BNE @not68040 // both rowBytes multiples of 16 bytes ?
- MOVE.W D_bytes,D0
- ANDI.W #0x03,D0
- BNE @not68040 // width a whole number of long words ?
-
- @do68040:
-
- // MOVE16 ---------------------------------------------------------------------------
-
- MOVEQ #15,D0 // mask used by the AND.W instructions below
- CLR.L D1 // clear the upper word: D1 is subtracted from A2 as a long below
-
- SUB.L D_bytes,D_srcBytes // rowBytes -> stride (rowBytes - width)
- SUB.L D_bytes,D_dstBytes
-
- MOVE.W A1,D_align // long words to copy before dst reaches a 16-byte boundary (0-3)
- NEG.W D_align
- AND.W D0,D_align // (-dst) & 15 = bytes up to the boundary
- SUB.W D_align,D_bytes // the rest of the row is handled by MOVE16 + left-over
- LSR.W #2,D_align // bytes -> long words
-
- MOVE.W D_bytes,D1
- LSR.W #4,D1 // number of 16-byte moves ...
- AND.W D0,D1 // ... modulo 16 = size of the partial first pass through the unrolled loop
- LSL.W #2,D1 // * sizeof("MOVE16 (A0)+,(A1)+")
- LEA @loopend16,A2
- SUBA.L D1,A2 // A2 = entry point inside the unrolled loop for the partial pass
- MOVE.W D_bytes,D1
- LSR.W #8,D1 // D1 = number of full passes (16 moves of 16 bytes each)
-
- LSR.W #2,D_bytes // leftover words (0-3)
- ANDI.W #3,D_bytes
-
- @rowloop16:
-
- // aligning loop
- MOVE.W D_align,D0
- BRA.S @prewordloopend16 // enter at the DBRA so that a count of 0 copies nothing
- @prewordloopstart16:
- MOVE.L (A0)+,(A1)+
- @prewordloopend16:
- DBRA D0,@prewordloopstart16
-
- // main block copy loop
- MOVE.W D1,D0 // full passes still to run after the partial one
- JMP (A2) // jump into the middle of the unrolled loop
- @loopstart16:
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- /* MOVE16 (A0)+,(A1)+ */ DC.L 0xF6209000
- @loopend16:
- DBRA D0,@loopstart16
-
- // left-over words loop
- MOVE.W D_bytes,D0
- BRA.S @postwordloopend16 // enter at the DBRA so that a count of 0 copies nothing
- @postwordloopstart16:
- MOVE.L (A0)+,(A1)+
- @postwordloopend16:
- DBRA D0,@postwordloopstart16
-
- ADDA.L D_srcBytes,A0 // advance to the start of the next row
- ADDA.L D_dstBytes,A1
-
- SUBQ.W #1,D_y
- BNE.S @rowloop16 // rows left ?
-
- BRA @end
-
- @not68040:
-
- // MOVE.L ---------------------------------------------------------------------------
-
- MOVEQ #15,D0 // mask used by the AND.W below
- CLR.L D1 // clear the upper word: D1 is subtracted from A2 as a long below
-
- SUB.L D_bytes,D_srcBytes // rowBytes -> stride (rowBytes - width)
- SUB.L D_bytes,D_dstBytes
-
- MOVE.W D_bytes,D1
- LSR.W #2,D1 // number of long words per row ...
- AND.W D0,D1 // ... modulo 16 = size of the partial first pass through the unrolled loop
- ADD.W D1,D1 // * sizeof("MOVE.L (A0)+,(A1)+")
- LEA @loopend,A2
- SUBA.L D1,A2 // A2 = entry point inside the unrolled loop for the partial pass
- MOVE.W D_bytes,D1
- LSR.W #6,D1 // D1 = number of full passes (16 long words = 64 bytes each)
-
- MOVE.W D_bytes,D_align
- ANDI.W #2,D_align // non-zero when a left-over word (2 bytes) must be copied
- ANDI.W #1,D_bytes // non-zero when a left-over byte must be copied
-
- @rowloop:
-
- // main block copy loop
- MOVE.W D1,D0 // full passes still to run after the partial one
- JMP (A2) // jump into the middle of the unrolled loop
- @loopstart:
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- MOVE.L (A0)+,(A1)+
- @loopend:
- DBRA D0,@loopstart
-
- // do left-over bytes
- TST.W D_align
- BEQ.S @skipword
- MOVE.W (A0)+,(A1)+
- @skipword:
- TST.W D_bytes
- BEQ.S @skipbyte
- MOVE.B (A0)+,(A1)+
- @skipbyte:
-
- ADDA.L D_srcBytes,A0 // advance to the start of the next row
- ADDA.L D_dstBytes,A1
-
- SUBQ.W #1,D_y
- BNE.S @rowloop // rows left ?
-
- BRA @end
-
- @small:
-
- // SMALL ----------------------------------------------------------------------------
-
- SUB.L D_bytes,D_srcBytes // rowBytes -> stride (rowBytes - width)
- SUB.L D_bytes,D_dstBytes
-
- MOVE.W D_bytes,D1
- LSR.W #2,D1 // D1 = whole long words per row
- MOVE.W D_bytes,D_align
- ANDI.W #2,D_align // non-zero when a left-over word (2 bytes) must be copied
- ANDI.W #1,D_bytes // non-zero when a left-over byte must be copied
-
- @smallrowloop:
-
- // words loop
- MOVE.W D1,D0
- BRA.S @smallwordloopend // enter at the DBRA so that a count of 0 copies nothing
- @smallwordloopstart:
- MOVE.L (A0)+,(A1)+
- @smallwordloopend:
- DBRA D0,@smallwordloopstart
-
- // do left-over bytes
- TST.W D_align
- BEQ.S @smallskipword
- MOVE.W (A0)+,(A1)+
- @smallskipword:
- TST.W D_bytes
- BEQ.S @smallskipbyte
- MOVE.B (A0)+,(A1)+
- @smallskipbyte:
-
- ADDA.L D_srcBytes,A0 // advance to the start of the next row
- ADDA.L D_dstBytes,A1
-
- SUBQ.W #1,D_y
- BNE.S @smallrowloop // rows left ?
-
- @end:
-
- // END ------------------------------------------------------------------------------
-
- MOVEM.L (SP)+,D3-D6/A2 // restore the registers saved on entry
-
- ASM_END
- }
-
- #pragma mark *** Generic C:
- #elif USE_GENERIC_C
-
- ///--------------------------------------------------------------------------------------
- // BlitPixieRect (generic C version)
- //
- // Copies rowsToCopy rows of numBytesPerRow bytes each from srcPixelP to dstPixelP,
- // one BlitPixieMemCopy call per row. After every row the source position moves on
- // by srcOffset bytes and the destination position by dstOffset bytes.
- // Both numBytesPerRow and rowsToCopy are asserted to be greater than zero.
- ///--------------------------------------------------------------------------------------
-
- void BlitPixieRect(
- unsigned char *srcPixelP,
- unsigned char *dstPixelP,
- unsigned long srcOffset,
- unsigned long dstOffset,
- unsigned short numBytesPerRow,
- unsigned short rowsToCopy)
- {
- unsigned char *srcRowP = srcPixelP; // start of the current source row
- unsigned char *dstRowP = dstPixelP; // start of the current destination row
- unsigned short rowsLeft;
-
- BLITPIXIE_ASSERT( numBytesPerRow > 0 );
- BLITPIXIE_ASSERT( rowsToCopy > 0 );
-
- for ( rowsLeft = rowsToCopy; rowsLeft != 0; rowsLeft-- )
- {
- BlitPixieMemCopy( dstRowP, srcRowP, numBytesPerRow );
- srcRowP += srcOffset;
- dstRowP += dstOffset;
- }
- }
-
- #endif
-
-